from bertopic import BERTopic
# Restore the previously saved BERTopic model from disk.
topic_model = BERTopic.load(
    "/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_iea_cp_cclw_model"
)
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
import pandas as pd
# Read the workbook holding the documents and the year attached to each one.
df = pd.read_excel(
    "/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_iea_cp_cclw.xlsx"
)
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()

# Topic ids known to the loaded model.
topic_model.get_topic_info()["Topic"]

# Per-document topic assignment, restricted to the columns of interest.
topic_model.get_document_info(docs)[
    [
        "Topic",
        "Name",
        "Top_n_words",
        "Probability",
        "Representative_document",
    ]
]
| | Topic | Name | Top_n_words | Probability | Representative_document |
|---|---|---|---|---|---|
| 0 | -1 | -1_energy_project_development_support | energy - project - development - support - emi... | 0.132439 | False |
| 1 | -1 | -1_energy_project_development_support | energy - project - development - support - emi... | 0.077007 | False |
| 2 | -1 | -1_energy_project_development_support | energy - project - development - support - emi... | 0.695282 | False |
| 3 | -1 | -1_energy_project_development_support | energy - project - development - support - emi... | 0.125002 | False |
| 4 | 0 | 0_energy_climate_development_emission | energy - climate - development - emission - ma... | 0.327378 | False |
| ... | ... | ... | ... | ... | ... |
| 9883 | 28 | 28_transparent_generation_reliable_competition | transparent - generation - reliable - competit... | 1.000000 | False |
| 9884 | -1 | -1_energy_project_development_support | energy - project - development - support - emi... | 0.697941 | False |
| 9885 | 6 | 6_biofuels_biofuel_biodiesel_ethanol | biofuels - biofuel - biodiesel - ethanol - die... | 0.261042 | False |
| 9886 | 11 | 11_carbon_storage_energy_efficiency | carbon - storage - energy - efficiency - innov... | 0.209579 | False |
| 9887 | 0 | 0_energy_climate_development_emission | energy - climate - development - emission - ma... | 0.398140 | False |
9888 rows × 5 columns
from collections import Counter

# Frequency of every whitespace-separated token across all documents.
counts = Counter(word for doc in docs for word in doc.split())

# (token, count) pairs, most frequent first; tokens with equal counts keep
# the order in which they were first seen.
items = counts.most_common()

# Print the 100 most frequent tokens. Slicing (rather than indexing
# items[0..99]) avoids an IndexError when the corpus contains fewer than
# 100 distinct tokens.
for word, count in items[:100]:
    print(f"{word:<10}{count:>5}")
energy 24258 renewable 5814 efficiency 5782 emission 5445 project 4847 plan 4715 development 4620 electricity 4287 system 3964 climate 3949 building 3882 sector 3818 power 3782 national 3731 support 3443 gas 3369 policy 3201 target 3009 measure 3008 technology 2991 vehicle 2941 reduce 2899 include 2861 fuel 2853 standard 2834 government 2781 change 2620 programme 2563 increase 2511 tax 2409 heat 2402 source 2389 set 2385 production 2301 promote 2260 public 2253 provide 2251 reduction 2230 investment 2174 strategy 2124 requirement 2094 consumption 2073 environmental 2049 solar 2008 establish 1996 capacity 1977 aim 1973 program 1942 sustainable 1918 management 1901 carbon 1885 industry 1881 level 1878 transport 1859 develop 1817 action 1814 resource 1807 generation 1801 cost 1738 electric 1708 `` 1695 market 1693 company 1665 improve 1658 fund 1650 plant 1636 air 1615 scheme 1607 wind 1600 product 1537 supply 1522 water 1499 green 1488 objective 1483 total 1466 build 1457 base 1443 equipment 1427 implementation 1427 implement 1409 service 1408 efficient 1403 country 1399 activity 1385 grant 1362 require 1357 achieve 1344 greenhouse 1317 goal 1288 ensure 1271 '' 1270 environment 1264 installation 1264 performance 1245 natural 1229 forest 1226 framework 1221 facility 1214 local 1181 tariff 1173
# Ask the model for the five topics closest to the query "Transport".
similar_topics, similarity = topic_model.find_topics(
    search_term="Transport",
    top_n=5,
)
# Show the words of the best-ranked match.
# NOTE(review): the best match is not guaranteed to be on-topic; inspect
# `similarity` before relying on it.
topic_model.get_topic(similar_topics[0])
[('transparent', 0.07180454680384146),
('generation', 0.03550348292371022),
('reliable', 0.03417482361218522),
('competition', 0.03389988982837627),
('electrical', 0.02941211389437091),
('flexible', 0.02925387541205265),
('transmission', 0.028266950447326415),
('affordable', 0.0254538800551447),
('financially', 0.02472697203215678),
('wholesale', 0.02128862307104659)]
# Ask the model for the five topics closest to the query "Industry".
similar_topics, similarity = topic_model.find_topics(
    search_term="Industry",
    top_n=5,
)
# Show the words of the best-ranked match.
# NOTE(review): the best match is not guaranteed to be on-topic; inspect
# `similarity` before relying on it.
topic_model.get_topic(similar_topics[0])
[('vehicle', 0.09872622440931626),
('fuel', 0.03216303864189669),
('emission', 0.02477394343334291),
('mobility', 0.021767777207266862),
('passenger', 0.020463693943990093),
('purchase', 0.01845449245252215),
('hybrid', 0.017259508077390077),
('truck', 0.01231378011249428),
('government', 0.011820709804332022),
('battery', 0.010923226473897669)]
# Ask the model for the five topics closest to the query "Energy systems".
similar_topics, similarity = topic_model.find_topics(
    search_term="Energy systems",
    top_n=5,
)
# Show the words of the best-ranked match.
# NOTE(review): the best match is not guaranteed to be on-topic; inspect
# `similarity` before relying on it.
topic_model.get_topic(similar_topics[0])
[('pv', 0.10658574246167805),
('photovoltaic', 0.06793013138935346),
('certification', 0.057393732829111865),
('geothermal', 0.03734211062109081),
('photovoltaics', 0.031011521070597656),
('manufacturing', 0.02982326407303994),
('irradiance', 0.026549480326347224),
('manufacture', 0.025380053496348042),
('intelligent', 0.024372310694128513),
('generation', 0.02249437089718655)]
# Ask the model for the five topics closest to the query "Buildings".
similar_topics, similarity = topic_model.find_topics(
    search_term="Buildings",
    top_n=5,
)
# Show the words of the best-ranked match.
# NOTE(review): the best match is not guaranteed to be on-topic; inspect
# `similarity` before relying on it.
topic_model.get_topic(similar_topics[0])
[('building', 0.11855952002243023),
('residential', 0.044670908054186095),
('requirement', 0.038122615955129904),
('energy', 0.03736539585568927),
('certificate', 0.026989005080772135),
('mandatory', 0.017764517473103085),
('renovation', 0.016992133155504307),
('insulation', 0.01656538204714158),
('certification', 0.015874596419938086),
('measure', 0.012512507170089183)]
# Ask the model for the five topics closest to the query
# "Agriculture, Forestry and Other Land Use".
similar_topics, similarity = topic_model.find_topics(
    search_term="Agriculture, Forestry and Other Land Use",
    top_n=5,
)
# Show the words of the best-ranked match.
# NOTE(review): the best match is not guaranteed to be on-topic; inspect
# `similarity` before relying on it.
topic_model.get_topic(similar_topics[0])
[('petroleum', 0.16396635427164843),
('exploitation', 0.12613048244983324),
('exploration', 0.12022723651321504),
('regulate', 0.06417742232836783),
('geothermal', 0.04750483692936236),
('prohibit', 0.03577380577199707),
('hydrocarbon', 0.031090741237797447),
('formally', 0.028261385335496114),
('hydraulic', 0.027679243464891794),
('establishes', 0.02407894665799679)]
# Number of documents loaded from the workbook's "docs" column.
len(docs)
9888
import os
# Folder that receives the exported figures.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True makes this a no-op when the folder is already there and
# removes the gap between checking for the folder and creating it.
os.makedirs(images_path, exist_ok=True)
import plotly.io as pio
# Make SVG the default format for static image export.
pio.kaleido.scope.default_format = "svg"

# Bar-chart view of the model's topics (30 topics, 20 words each).
fig = topic_model.visualize_barchart(
    top_n_topics=30,
    n_words=20,
    width=300,
    height=300,
)
pio.write_image(
    fig,
    file="/home/zhhuang/climate_policy_paper/paper_images/topic_iea_cp_cclw_barchart.svg",
)
# Leave the figure as the cell's value so the notebook renders it.
fig
# Heatmap view of the model's topics, exported as SVG.
fig2 = topic_model.visualize_heatmap()
pio.write_image(
    fig2,
    file="/home/zhhuang/climate_policy_paper/paper_images/topic_iea_cp_cclw_heatmap.svg",
)
# Leave the figure as the cell's value so the notebook renders it.
fig2
# Topic overview figure produced by the model, exported as SVG.
fig3 = topic_model.visualize_topics()
pio.write_image(
    fig3,
    file="/home/zhhuang/climate_policy_paper/paper_images/topic_iea_cp_cclw_visualize_topics.svg",
)
# Leave the figure as the cell's value so the notebook renders it.
fig3
# Compute the topic hierarchy for the corpus.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# One path is used for both writing and re-reading the workbook. The
# original wrote to a path relative to the working directory but read back
# from this absolute path, which fails (or silently loads a stale file)
# whenever the notebook is not started from the code folder.
hierarchy_xlsx = "/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_hierarchical_topics.xlsx"

# strings_to_urls=False makes xlsxwriter store URL-like strings as plain text.
with pd.ExcelWriter(
    hierarchy_xlsx,
    engine="xlsxwriter",
    engine_kwargs={"options": {"strings_to_urls": False}},
) as writer:
    hierarchical_topics.to_excel(writer)

# Reload the saved table and draw the hierarchy from it.
# NOTE(review): list-valued cells come back from Excel as strings and the
# saved index becomes an extra column; confirm visualize_hierarchy accepts
# the reloaded frame.
hierarchical_topics = pd.read_excel(hierarchy_xlsx)
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
pio.write_image(fig4, '/home/zhhuang/climate_policy_paper/paper_images/topic_iea_cp_cclw_hierarchical_topics.svg')
# Leave the figure as the cell's value so the notebook renders it.
fig4
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [25:58<00:00, 55.66s/it]
# Turn every year into a string, replacing a year of 0 with '2020'
# (presumably 0 marks a missing year -- TODO confirm).
# Comparing str(year) instead of the raw value also catches a numeric 0,
# which the original `== '0'` test let through as the string '0'.
# Slice assignment updates the existing list in place.
timestamp[:] = ["2020" if str(year) == "0" else str(year) for year in timestamp]
# Topic representation per time bin; the timestamps are year strings parsed
# with "%Y" and grouped into 20 bins.
topics_over_time = topic_model.topics_over_time(
    docs,
    timestamp,
    datetime_format="%Y",
    nr_bins=20,
)

# Save to the absolute path that the file reads this workbook back from.
# The original wrote to a path relative to the working directory, so the
# later reload depended on where the notebook was started.
# strings_to_urls=False makes xlsxwriter store URL-like strings as plain text.
with pd.ExcelWriter(
    "/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_topics_over_time.xlsx",
    engine="xlsxwriter",
    engine_kwargs={"options": {"strings_to_urls": False}},
) as writer:
    topics_over_time.to_excel(writer)
19it [4:01:49, 763.67s/it]
# Reload the saved topics-over-time table from its workbook.
topics_over_time = pd.read_excel(
    "/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_topics_over_time.xlsx"
)

# Plot the reloaded table and export the figure as SVG.
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
pio.write_image(
    fig5,
    file="/home/zhhuang/climate_policy_paper/paper_images/topic_iea_cp_cclw_visualize_topics_over_time.svg",
)
# Leave the figure as the cell's value so the notebook renders it.
fig5